library(chemhelper) #for sim_*(), plot_pca(), plot_plsda(), get_VIP(), and get_loadings()
library(tidyverse)
library(ropls) #for opls(), which does pca and pls-da
library(cowplot) #for plot_grid()
theme_set(theme_gray())
library(iheatmapr) #for correlation heatmaps

No discriminating variables

The code chunk below creates a data frame with a grouping variable with two groups (a and b) with n=10 per group, 5 variables that don’t covary, and two groups of 10 variables that moderately covary.

# myseed <- runif(1, 0, 1000)
myseed <- 401.6368
df <- data_frame(group = rep(c("a", "b"), each = 10))
no_discr <- df %>%
  sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>% 
  sim_covar(10, var = 1, cov = 0.5, name = "corrA", seed = myseed + 1) %>% 
  sim_covar(10, var = 1, cov = 0.5, name = "corrB", seed = myseed + 2)

This heatmap shows the correlation structure.

no_discr %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
no_discr.cor <- no_discr %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)
cor1 <- ggplot(no_discr.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")

Then I perform PCA and PLS-DA on this dataset, forcing both models to have two axes (for plotting). The PCA score plot shows no separation and the first two axes explain more than half of the variation in the data. The PLS-DA score plot shows slight separation of the groups (even though there is none). It’s a pretty terrible model with a negative Q2 value and a low R2 value.

pca1.plot <- plot_pca(pca1, no_discr$group, annotate = "subtitle")
pls1.plot <- plot_plsda(pls1, annotate = "subtitle")
plot_grid(cor1,
          pca1.plot + theme(legend.position = "NULL"),
          pls1.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)

Few discriminating variables

Keeping the total number of observations and variables the same, and using the same random seed, I create a second data set with 5 variables that do not covary within groups, but differ in their means between groups.

# discr <- df %>% 
#   sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>% 
#   sim_covar(8, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>% 
#   sim_covar(7, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>% 
#   sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)
discr <- df %>% 
  sim_covar(10, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>% 
  sim_covar(10, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>% 
  sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)

This shows the correlation structure of the data. Because the discriminating variables have different means in the two groups, they are correlated about as strongly as the covarying variables.

discr %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
discr.cor <- discr %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)
cor2 <- ggplot(discr.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top") +
  annotate(geom = "text", x = 4, y = -1, label = "test")
cor2

The PCA score plot still shows no real separation because the discriminating variables do not contribute much to the total covariation in the data. The PLS-DA plot shows very strong separation, despite the differences between the groups being only due to 5 out of 25 variables. The PLS-DA model performs well, with high R2 and Q2 values

pca2.plot <- plot_pca(pca2, discr$group, annotate = "subtitle")
pls2.plot <- plot_plsda(pls2, annotate = "subtitle")
plot_grid(cor2, pca2.plot + theme(legend.position = "NULL"),
          pls2.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)

VIP scores from the PLS-DA model identify all 5 discriminating variables as being important for separating the groups (VIP > 1). Using the VIP > 1 cutoff, it also spurriously identifies two of the other variables as being important. Discriminating variables are not strongly correlated with the first two axes of the PCA.

VIPs <- get_VIP(pls2) %>%
  arrange(desc(VIP))
pca.load <- get_loadings(pca2) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))

Save plots and tables

fig1 <- plot_grid(cor1,
          pca1.plot + theme(legend.position = "NULL") + labs(title = ""),
          pls1.plot + theme(legend.position = "NULL") + labs(title = ""),
          cor2,
          pca2.plot + theme(legend.position = "NULL") + labs(title = ""),
          pls2.plot + theme(legend.position = "NULL") + labs(title = ""),
          ncol = 3, nrow = 2, labels = "AUTO")
save_plot("figs/fig1.png", fig1, ncol = 3, nrow = 2)
table1 <- full_join(VIPs, pca.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
Joining, by = "Variable"
table1
write_excel_csv(table1, "figs/table1.csv")

Red Herring

Large variation with weak discrimination plus small variation with large discrimination

herring <- df %>% 
  sim_discr(p = 10, var = 1, cov = 0.5, group_means = c(-0.5, 0.5), name = "highvar") %>% 
  sim_discr(p = 10, var = 1, cov = 0, group_means = c(-1, 1), name = "lowvar") %>% 
  sim_covar(p = 5, var = 1, cov = 0, name = "noise")
herring %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
herring.cor <- herring %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)
cor3 <- ggplot(herring.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
pca3 <- opls(select(herring, -group), plot = FALSE)
PCA
20 samples x 25 variables
standard scaling of predictors
pls3 <- opls(select(herring, -group), no_discr$group, plot = FALSE, predI = 2, permI = 500)
PLS-DA
20 samples x 25 variables and 1 response
standard scaling of predictors and response(s)
pca3.plot <- plot_pca(pca3, herring$group, annotate = "subtitle")
pls3.plot <- plot_plsda(pls3, annotate = "subtitle")
Joining, by = "sample"
plot_grid(cor3,
          pca3.plot + theme(legend.position = "NULL"),
          pls3.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)

VIPs3 <- get_VIP(pls3) %>%
  arrange(desc(VIP))
pca3.load <- get_loadings(pca3) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))
table3 <- full_join(VIPs3, pca3.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
Joining, by = "Variable"
table3 %>% arrange(desc(`PLS-DA VIP score`))
table3 %>% arrange(desc(`PC1 loading`))
# write_excel_csv(table1, "figs/table1.csv")
---
title: "PCA vs. PLS with no and few discriminating variables"
author: "Eric R. Scott"
output: html_notebook
---
```{r message=FALSE, warning=FALSE}
library(chemhelper) #for sim_*(), plot_pca(), plot_plsda(), get_VIP(), and get_loadings()
library(tidyverse)
library(ropls) #for opls(), which does pca and pls-da
library(cowplot) #for plot_grid()
theme_set(theme_gray())
library(iheatmapr) #for correlation heatmaps
```

# No discriminating variables

The code chunk below creates a data frame with a grouping variable with two groups (a and b) with n=10 per group, 5 variables that don't covary, and two groups of 10 variables that moderately covary.

```{r}
# myseed <- runif(1, 0, 1000)
myseed <- 401.6368

df <- data_frame(group = rep(c("a", "b"), each = 10))
no_discr <- df %>%
  sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>% 
  sim_covar(10, var = 1, cov = 0.5, name = "corrA", seed = myseed + 1) %>% 
  sim_covar(10, var = 1, cov = 0.5, name = "corrB", seed = myseed + 2)
```

This heatmap shows the correlation structure.

```{r}
no_discr %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r}
no_discr.cor <- no_discr %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)

cor1 <- ggplot(no_discr.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
```

Then I perform PCA and PLS-DA on this dataset, forcing both models to have two axes (for plotting).  The PCA score plot shows no separation and the first two axes explain more than half of the variation in the data.  The PLS-DA score plot shows slight separation of the groups (even though there is none).  It's a pretty terrible model with a negative Q^2^ value and a low R^2^ value.

```{r include=FALSE}
pca1 <- opls(select(no_discr, -group), plot = FALSE)
pls1 <- opls(select(no_discr, -group), no_discr$group, plot = FALSE, predI = 2, permI = 500)
```

```{r message=FALSE, warning=FALSE}
pca1.plot <- plot_pca(pca1, no_discr$group, annotate = "subtitle")
pls1.plot <- plot_plsda(pls1, annotate = "subtitle")
plot_grid(cor1,
          pca1.plot + theme(legend.position = "NULL"),
          pls1.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)
```

# Few discriminating variables

Keeping the total number of observations and variables the same, and using the same random seed, I create a second data set with 5 variables that do not covary within groups, but differ in their means between groups.

```{r}
# discr <- df %>% 
#   sim_covar(5, var = 1, cov = 0, name = "uncorr", seed = myseed) %>% 
#   sim_covar(8, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>% 
#   sim_covar(7, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>% 
#   sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)
discr <- df %>% 
  sim_covar(10, var = 1, cov = 0.55, name = "corrA", seed = myseed + 1) %>% 
  sim_covar(10, var = 1, cov = 0.55, name = "corrB", seed = myseed + 2) %>% 
  sim_discr(5, var = 1, cov = 0, group_means = c(-1, 1), name = "discr", seed = myseed + 3)
```

This shows the correlation structure of the data.  Because the discriminating variables have different means in the two groups, they are correlated about as strongly as the covarying variables.

```{r}
discr %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r}
discr.cor <- discr %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)

cor2 <- ggplot(discr.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top") +
  annotate(geom = "text", x = 4, y = -1, label = "test")
cor2
```
The PCA score plot still shows no real separation because the discriminating variables do not contribute much to the total covariation in the data.  The PLS-DA plot shows very strong separation, despite the differences between the groups being only due to 5 out of 25 variables.  The PLS-DA model performs well, with high R^2^ and Q^2^ values

```{r include=FALSE}
pca2 <- opls(select(discr, -group), plot = FALSE)
pls2 <- opls(select(discr, -group), discr$group, plot = FALSE, predI = 2, permI = 500)
```

```{r message=FALSE, warning=FALSE}
pca2.plot <- plot_pca(pca2, discr$group, annotate = "subtitle")
pls2.plot <- plot_plsda(pls2, annotate = "subtitle")
plot_grid(cor2, pca2.plot + theme(legend.position = "NULL"),
          pls2.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)
```

VIP scores from the PLS-DA model identify all 5 discriminating variables as being important for separating the groups (VIP > 1).  Using the VIP > 1 cutoff, it also spurriously identifies two of the other variables as being important.  Discriminating variables are not strongly correlated with the first two axes of the PCA.

```{r}
VIPs <- get_VIP(pls2) %>%
  arrange(desc(VIP))

pca.load <- get_loadings(pca2) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))
```

# Save plots and tables

```{r}
fig1 <- plot_grid(cor1,
          pca1.plot + theme(legend.position = "NULL") + labs(title = ""),
          pls1.plot + theme(legend.position = "NULL") + labs(title = ""),
          cor2,
          pca2.plot + theme(legend.position = "NULL") + labs(title = ""),
          pls2.plot + theme(legend.position = "NULL") + labs(title = ""),
          ncol = 3, nrow = 2, labels = "AUTO")
save_plot("figs/fig1.png", fig1, ncol = 3, nrow = 2)
```

```{r}
table1 <- full_join(VIPs, pca.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
table1
write_excel_csv(table1, "figs/table1.csv")
```

# Red Herring
Large variation with weak discrimination plus small variation with large discrimination

```{r}
herring <- df %>% 
  sim_discr(p = 10, var = 1, cov = 0.5, group_means = c(-0.5, 0.5), name = "highvar") %>% 
  sim_discr(p = 10, var = 1, cov = 0, group_means = c(-1, 1), name = "lowvar") %>% 
  sim_covar(p = 5, var = 1, cov = 0, name = "noise")
```

```{r}
herring %>% 
  select(-group) %>% 
  cor() %>% 
  iheatmap(row_labels = TRUE, col_labels = TRUE, name = "cor")
```
```{r}
herring.cor <- herring %>% 
  select(-group) %>% 
  cor() %>%
  as.data.frame() %>% 
  rownames_to_column(var = "row") %>% 
  gather(-row, key = col, value = cor)

cor3 <- ggplot(herring.cor, aes(x = col, y = row, fill = cor)) +
  geom_tile() +
  scale_fill_gradient2("Correlation Coeficient") +
  labs(x = NULL, y = NULL) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90), legend.position = "top")
```


```{r}
pca3 <- opls(select(herring, -group), plot = FALSE)
pls3 <- opls(select(herring, -group), no_discr$group, plot = FALSE, predI = 2, permI = 500)
```
```{r}
pca3.plot <- plot_pca(pca3, herring$group, annotate = "subtitle")
pls3.plot <- plot_plsda(pls3, annotate = "subtitle")
plot_grid(cor3,
          pca3.plot + theme(legend.position = "NULL"),
          pls3.plot + theme(legend.position = "NULL"), nrow = 1, ncol = 3)
```

```{r}
VIPs3 <- get_VIP(pls3) %>%
  arrange(desc(VIP))

pca3.load <- get_loadings(pca3) %>%
  select(Variable, p1, p2) %>%
  arrange(desc(abs(p1)))

table3 <- full_join(VIPs3, pca3.load) %>%
  arrange(Variable) %>% 
  rename(`PLS-DA VIP score` = VIP, `PC1 loading` = p1, `PC2 loading` = p2) %>% 
  mutate_if(is.double, ~round(.,4))
table3 %>% arrange(desc(`PLS-DA VIP score`))
table3 %>% arrange(desc(`PC1 loading`))
# write_excel_csv(table1, "figs/table1.csv")

```

